{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict, Counter\n",
    "\n",
    "\n",
    "class WordCorrector:\n",
    "    def __init__(self):\n",
    "        self.docs = list()\n",
    "        self.cache = defaultdict(list)\n",
    "        self.build_cache()\n",
    "\n",
    "    def build_cache(self):\n",
    "        \"\"\"\n",
    "        构建索引\n",
    "        :return: None\n",
    "        \"\"\"\n",
    "        doc_id = 0\n",
    "        with open('words.txt', 'r') as f:\n",
    "            for word in f.readlines():\n",
    "                self.docs.append(word)\n",
    "                word = word.lower()\n",
    "                for i in range(len(word)-1):\n",
    "                    term = word[i:i+2]\n",
    "                    self.cache[term].append(doc_id)\n",
    "                doc_id += 1\n",
    "\n",
    "    def correct(self, word, limit: int = 7):\n",
    "        \"\"\"\n",
    "        在倒排索引里检索单词原型\n",
    "        :param word:\n",
    "        :param limit: 返回的匹配结果数量\n",
    "        :return:\n",
    "        \"\"\"\n",
    "        word = word.lower()\n",
    "        result_list = list()\n",
    "        for i in range(len(word)-1):\n",
    "            term = word[i:i+2]\n",
    "            result_list += self.cache.get(term, [])\n",
    "        counter = 0\n",
    "        for item in Counter(result_list).most_common():\n",
    "            similar_word = self.docs[item[0]]\n",
    "            if len(word) - 1 <= len(similar_word) <= len(word) + 1:\n",
    "                print(similar_word)\n",
    "                counter += 1\n",
    "                if counter > limit:\n",
    "                    break\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_corrector = WordCorrector()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "989"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(word_corrector.cache)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "clove\n",
      "\n",
      "glove\n",
      "\n",
      "Love\n",
      "\n",
      "loved\n",
      "\n",
      "lovee\n",
      "\n",
      "lovey\n",
      "\n",
      "Lovel\n",
      "\n",
      "Lover\n",
      "\n"
     ]
    }
   ],
   "source": [
    "word_corrector.correct('loove')\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "3.8.8-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}